Load necessary packages
library(caret) # Classification and Regression Training
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
library(corrplot)
## corrplot 0.84 loaded
library(dplyr) # A Grammar of Data Manipulations
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats) # Tools for Working with Categorical Variables (Factors)
## Warning: package 'forcats' was built under R version 3.5.2
library(gdata) # Various R Programming Tools for Data Manipulation
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(ggplot2) # Create Elegant Data Visualizations Using the Grammar of Graphics
library(ggrepel) # Automatically Position Non-Overlapping Text Labels with ‘ggplot2’
## Warning: package 'ggrepel' was built under R version 3.5.2
library(ggridges) # Ridgeline Plots in ‘ggplot2’
##
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
##
## scale_discrete_manual
library(ggthemes) # Extra Themes, Scales, and Geoms for ‘ggplot2’
## Warning: package 'ggthemes' was built under R version 3.5.2
library(gmodels) # Various R Programming Tools for Model Fitting
library(grid) # The Grid Graphics Package
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:gdata':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(knitr) # A General-Purpose Package for Dynamic Report Generation in R
## Warning: package 'knitr' was built under R version 3.5.2
library(lmtest) # Testing Linear Regression Models
## Warning: package 'lmtest' was built under R version 3.5.2
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(plyr) # Tools for Splitting, Applying and Combining Data
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(purrr) # Functional Programming Tools
## Warning: package 'purrr' was built under R version 3.5.2
##
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
##
## compact
## The following object is masked from 'package:gdata':
##
## keep
## The following object is masked from 'package:caret':
##
## lift
library(readr) # Read Rectangular Text Data
library(rpart)
## Warning: package 'rpart' was built under R version 3.5.2
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(scales) # Scale Functions for Visualization
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
## The following object is masked from 'package:purrr':
##
## discard
library(shiny) # Web Application Framework for R
## Warning: package 'shiny' was built under R version 3.5.2
library(skimr) # Compact and Flexible Summaries of Data
## Warning: package 'skimr' was built under R version 3.5.2
##
## Attaching package: 'skimr'
## The following object is masked from 'package:knitr':
##
## kable
## The following object is masked from 'package:stats':
##
## filter
library(stringr) # Simple, Consistent Wrappers for Common String Operations
library(tibble) # Simple Data Frames
## Warning: package 'tibble' was built under R version 3.5.2
library(tidyr) # Easily Tidy Data with ‘spread()’ and ‘gather()’ Functions
## Warning: package 'tidyr' was built under R version 3.5.2
library(tidyverse) # Install and Load ‘Tidyverse’ Packages
library(usmap)
## Warning: package 'usmap' was built under R version 3.5.2
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
## The following object is masked from 'package:plyr':
##
## ozone
library(mapdata)
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.5.2
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggmap':
##
## theme_nothing
## The following object is masked from 'package:ggthemes':
##
## theme_map
library("googleway")
library("ggspatial")
library("rnaturalearth")
library("rnaturalearthdata")
theme_set(theme_bw())
library("sf")
## Warning: package 'sf' was built under R version 3.5.2
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(gridExtra)
Creating dfs of color schemes I may use
state_colors <- c("brown1","lightseagreen","goldenrod1","slateblue")
species_colors <- c("royalblue","salmon","darkmagenta")
pisaster_colors <- c("thistle1","plum","orchid3","darkmagenta")
RMSE_colors <- c("slategray1","steelblue3","thistle2","plum3")
bioreg_colours <- c("brown3","goldenrod1","darkolivegreen4","skyblue3","mediumpurple3","hotpink3")
Load cleaned data
#clean_data <- readRDS("./data/processed_data/processeddata.rds")
clean_data <- readRDS("../../data/processed_data/processeddata.rds")
skim(clean_data)
## Skim summary statistics
## n obs: 13165
## n variables: 23
##
## ── Variable type:factor ───────────────────────────────────────────────
## variable missing complete n n_unique
## bioregion 0 13165 13165 6
## georegion 0 13165 13165 9
## group_code 0 13165 13165 9
## group_code_UCSC_other 0 13165 13165 2
## island 0 13165 13165 6
## marine_season_code 0 13165 13165 57
## marine_site_name 0 13165 13165 77
## method_code 0 13165 13165 4
## method_code_IP_other 0 13165 13165 2
## mpa_region 0 13165 13165 6
## season_name 0 13165 13165 4
## site_code 0 13165 13165 77
## species_code 0 13165 13165 3
## state 0 13165 13165 4
## top_counts ordered
## San: 5608, Oly: 4138, Gov: 2366, Sal: 652 FALSE
## CA : 5382, CA : 2592, CA : 1660, OR: 1386 FALSE
## UCS: 9657, UCL: 2032, PBN: 400, SSS: 366 FALSE
## UCS: 9657, Oth: 3508, NA: 0 FALSE
## Mai: 12364, Bar: 279, Sad: 250, Hat: 150 FALSE
## SU1: 440, SU1: 403, SU1: 392, FA1: 376 FALSE
## Mil: 524, Sco: 513, Sti: 472, End: 468 FALSE
## IP: 12012, BT2: 782, GSE: 336, TS3: 35 FALSE
## IP: 12012, Oth: 1153, NA: 0 FALSE
## Cen: 5382, Sou: 2627, NUL: 1932, Nor: 1660 FALSE
## Sum: 4552, Fal: 4489, Spr: 4112, Win: 12 FALSE
## MCR: 524, SCT: 513, SWC: 472, END: 468 FALSE
## P.o: 11416, K FALSE
## CA: 10548, OR: 1386, WA: 865, AK: 366 FALSE
##
## ── Variable type:integer ──────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## marine_common_season 0 13165 13165 116.82 19.23 78 101
## marine_common_year 0 13165 13165 2009.7 4.81 2000 2006
## marine_sort_order 0 13165 13165 5566.24 1125.15 1720 5020
## season_sequence 0 13165 13165 2.03 0.81 1 1
## size_bin 0 13165 13165 82.44 42.2 5 50
## size_sort_order 0 13165 13165 9.57 4.15 1 6
## total 0 13165 13165 9.92 16.43 1 2
## p50 p75 p100 hist
## 118 131 152 ▅▅▆▇▇▇▇▅
## 2010 2013 2018 ▃▅▅▇▇▆▆▆
## 6090 6370 7650 ▁▁▁▂▃▃▇▁
## 2 3 4 ▇▁▇▁▁▇▁▁
## 80 110 320 ▅▇▇▃▁▁▁▁
## 10 12 33 ▅▇▇▃▁▁▁▁
## 4 11 360 ▇▁▁▁▁▁▁▁
##
## ── Variable type:numeric ──────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## latitude 0 13165 13165 38.67 5.2 32.87 34.73 36.62
## longitude 0 13165 13165 -122.25 2.86 -135.38 -123.98 -121.94
## p75 p100 hist
## 41.65 57.05 ▇▆▃▂▁▁▁▁
## -120.62 -117.25 ▁▁▁▁▅▇▆▃
names(clean_data)
## [1] "group_code" "site_code"
## [3] "marine_site_name" "marine_sort_order"
## [5] "latitude" "longitude"
## [7] "marine_common_season" "marine_season_code"
## [9] "marine_common_year" "season_sequence"
## [11] "season_name" "method_code"
## [13] "species_code" "size_sort_order"
## [15] "size_bin" "total"
## [17] "mpa_region" "georegion"
## [19] "bioregion" "island"
## [21] "group_code_UCSC_other" "method_code_IP_other"
## [23] "state"
summary(clean_data$bioregion)
## AK_2_BritColumb ChannelIsl_South GovtPt_2_Mexico
## 366 35 2366
## OlyCstWA_2_SanFran SalishSea_WA SanFran_2_GovtPt
## 4138 652 5608
bioregion_counts <- clean_data %>% mutate(bioregion = recode(bioregion, "AK_2_BritColumb"="AK to \n British \n Columbia","ChannelIsl_South"="Channel \n Islands \n South","SalishSea_WA"="Salish \n Sea, WA","OlyCstWA_2_SanFran"="Olympic \n Coast WA \n to SanFran","SanFran_2_GovtPt"="SanFran to \n Gov't Point","GovtPt_2_Mexico"="Gov't Point \n to Mexico"))
skim(bioregion_counts$bioregion)
##
## Skim summary statistics
##
## ── Variable type:factor ───────────────────────────────────────────────
## variable missing complete n n_unique
## bioregion_counts$bioregion 0 13165 13165 6
## top_counts ordered
## San: 5608, Oly: 4138, Gov: 2366, Sal: 652 FALSE
bioregion_counts_ordered <- bioregion_counts %>%
mutate(bioregion = factor(bioregion, levels = c("AK to \n British \n Columbia","Salish \n Sea, WA","Olympic \n Coast WA \n to SanFran","SanFran to \n Gov't Point","Channel \n Islands \n South","Gov't Point \n to Mexico"))) %>%
arrange(bioregion)
bioregion_histogram <- bioregion_counts_ordered %>% ggplot() + geom_bar(aes(bioregion),fill=bioreg_colours) #+ theme(axis.text.x = element_text(angle=25, hjust=1))
bioregion_histogram
ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/1.bioregion_histogram.png",plot =bioregion_histogram , width = 5, height = 4)
bioregion_counts_ordered$season_sequence <- as.factor(bioregion_counts_ordered$season_sequence)
numer_data <- bioregion_counts_ordered %>% select_if(is.numeric)
numer_data$bioregion <- bioregion_counts_ordered$bioregion
numer_data <- numer_data %>% dplyr::select(-size_sort_order)
numer_data_plot <- numer_data %>% gather(-bioregion, key = "var", value = "value") %>%
ggplot() + geom_point(aes(x = bioregion, y=value, col=bioregion)) + theme(axis.text.x = element_blank()) +
facet_wrap(~ var, scales = "free") + theme_bw() + theme(legend.position = "none", axis.title.x = element_blank()) + scale_color_manual(values=bioreg_colours)
print(numer_data_plot)
latitude_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=latitude, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Latitude") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#latitude_bioreg
longitude_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=longitude, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Longitude") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#longitude_bioreg
season_chron_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=marine_common_season, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Season & Year") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#season_chron_bioreg
year_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=marine_common_year, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Year") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#year_bioreg
site_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=marine_sort_order, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Sites (Ordered)") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#site_bioreg
size_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=size_bin, col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("Size Class") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#size_bioreg
abundance_bioreg <- numer_data %>% ggplot(aes(x = bioregion, y=log(total), col=bioregion)) + geom_boxplot() + geom_point(size=4,alpha=0.1) + ylab("log(Spp. Abundance)") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#abundance_bioreg
grid.arrange(latitude_bioreg,longitude_bioreg,site_bioreg,nrow=1,top="(A) Spatial Numeric Predictors")
ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/2.spatial_num_pred.png",plot =grid.arrange(latitude_bioreg,longitude_bioreg,site_bioreg,nrow=1,top="(A) Spatial Numeric Predictors")
, width = 5, height = 4)
grid.arrange(season_chron_bioreg,year_bioreg,nrow=1,top="(B) Temporal Numeric Predictors")
ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/3.temporal_num_pred.png",plot =grid.arrange(season_chron_bioreg,year_bioreg,nrow=1,top="(B) Temporal Numeric Predictors"))
## Saving 7 x 5 in image
grid.arrange(abundance_bioreg,size_bioreg,nrow=1,top="(C) Spp.-Related Numeric Predictors")
ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/4.spp.rel_num_pred.png",plot =grid.arrange(abundance_bioreg,size_bioreg,nrow=1,top="(C) Spp.-Related Numeric Predictors"))
## Saving 7 x 5 in image
#grid.arrange(latitude_bioreg,longitude_bioreg,season_chron_bioreg,year_bioreg,site_bioreg,size_bioreg,abundance_bioreg,nrow=3)
categ_data <- bioregion_counts_ordered %>% select_if(is.factor)
categ_data <- categ_data %>% dplyr::select(-marine_site_name,-season_sequence,-marine_season_code)
names(categ_data)
## [1] "group_code" "site_code"
## [3] "season_name" "method_code"
## [5] "species_code" "mpa_region"
## [7] "georegion" "bioregion"
## [9] "island" "group_code_UCSC_other"
## [11] "method_code_IP_other" "state"
categ_data_plotting <- categ_data %>% dplyr::select(-method_code_IP_other,-group_code_UCSC_other)
groupcode_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=group_code, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Surveying Group") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + scale_color_manual(values=bioreg_colours)
#groupcode_bioreg
sitecode_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=site_code, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Site") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),axis.text.x = element_blank(),axis.title.x=element_blank(),axis.text.y = element_blank(),legend.position = "none",axis.title.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#sitecode_bioreg
season_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=season_name, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Season") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#season_bioreg
method_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=method_code, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Sampling Method") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#method_bioreg
species_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=species_code, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Species") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#species_bioreg
mpareg_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=mpa_region, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("MPA Region") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#mpareg_bioreg
georegion_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=georegion, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Georegion") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#georegion_bioreg
island_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=island, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("Island Location?") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#island_bioreg
state_bioreg <- categ_data_plotting %>% ggplot(aes(x = bioregion, y=state, col=bioregion)) + geom_jitter(size=4,alpha=0.15,width=0.4,height=0.25) + ylab("State") + theme(axis.text.x = element_blank(),axis.title.x=element_blank(),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))+ scale_color_manual(values=bioreg_colours)
#state_bioreg
grid.arrange(groupcode_bioreg,method_bioreg,species_bioreg,nrow=3,top="(A) Sample Info Categorical Predictors")
ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/5.sampleinfo_cat_pred.png",plot =grid.arrange(groupcode_bioreg,method_bioreg,species_bioreg,nrow=3,top="(A) Sample Info Categorical Predictors"))
## Saving 7 x 5 in image
grid.arrange(state_bioreg,georegion_bioreg,sitecode_bioreg,nrow=3,top="(B) Spatial Categorical Predictors")
ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/6.spatial_cat_pred.png",plot =grid.arrange(state_bioreg,georegion_bioreg,sitecode_bioreg,nrow=3,top="(B) Spatial Categorical Predictors"))
## Saving 7 x 5 in image
grid.arrange(season_bioreg,mpareg_bioreg,island_bioreg,nrow=3,top="(C) Ecological Categorical Predictors")
ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/7.eco_cat_pred.png",plot =grid.arrange(season_bioreg,mpareg_bioreg,island_bioreg,nrow=3,top="(C) Ecological Categorical Predictors"))
## Saving 7 x 5 in image
#grid.arrange(groupcode_bioreg,sitecode_bioreg,season_bioreg,method_bioreg,species_bioreg,mpareg_bioreg,georegion_bioreg,island_bioreg,state_bioreg)
Making sure Bioregion is first
#names(clean_data) # bioregion is 19, 23 total, get rid of 3 and 14
clean_data_final <- clean_data[c(19,1,2,4,5,6,7,8,9,10,11,12,13,15,16,17,18,20,21,22,23)]
names(clean_data_final) # double checking
## [1] "bioregion" "group_code"
## [3] "site_code" "marine_sort_order"
## [5] "latitude" "longitude"
## [7] "marine_common_season" "marine_season_code"
## [9] "marine_common_year" "season_sequence"
## [11] "season_name" "method_code"
## [13] "species_code" "size_bin"
## [15] "total" "mpa_region"
## [17] "georegion" "island"
## [19] "group_code_UCSC_other" "method_code_IP_other"
## [21] "state"
#write code that splits data into 70/30 train/test
#call the 2 parts data_train and data_test
library(caret)
set.seed(123)
trainset_treefit <- caret::createDataPartition(y = clean_data_final$bioregion, p = 0.7, list = FALSE)
data_train_treefit = clean_data_final[trainset_treefit,] #extract observations/rows for training, assign to new variable
data_test_treefit = clean_data_final[-trainset_treefit,] #do the same for the test set
library("mlr")
## Warning: package 'mlr' was built under R version 3.5.2
## Loading required package: ParamHelpers
## Warning: package 'ParamHelpers' was built under R version 3.5.2
##
## Attaching package: 'mlr'
## The following object is masked from 'package:gdata':
##
## resample
## The following object is masked from 'package:caret':
##
## train
null_accuracy_treefit <- measureACC("SanFran_2_GovtPt", clean_data_final$bioregion)
null_accuracy_treefit
## [1] 0.425978
#There is probably a nicer tidyverse way of doing this. I just couldn't think of it, so did it this way.
set.seed(1111) #makes each code block reproducible
outcomename = "bioregion"
fitControl <- trainControl(method="repeatedcv",number=5,repeats=5) #setting CV method for caret
Npred <- ncol(data_train_treefit)-1 # number of predictors
resultmat <- data.frame(Variable = names(data_train_treefit)[-1], Accuracy = rep(0,Npred)) #store performance for each variable
for (n in 2:ncol(data_train_treefit)) #loop over each predictor. For this to work, outcome must be in 1st column
{
fit1 <- caret::train( as.formula(paste(outcomename, "~",names(data_train_treefit)[n])), data = data_train_treefit, method = "rpart", trControl = fitControl, na.action = na.pass, tuneLength = 10)
resultmat[n-1,2]= max(fit1$results$Accuracy)
}
print(resultmat)
## Variable Accuracy
## 1 group_code 0.6853236
## 2 site_code 0.7972035
## 3 marine_sort_order 1.0000000
## 4 latitude 0.9997831
## 5 longitude 1.0000000
## 6 marine_common_season 0.7002923
## 7 marine_season_code 0.6245137
## 8 marine_common_year 0.4435607
## 9 season_sequence 0.6970392
## 10 season_name 0.6970383
## 11 method_code 0.5139393
## 12 species_code 0.4583143
## 13 size_bin 0.4284417
## 14 total 0.4427816
## 15 mpa_region 0.9036776
## 16 georegion 0.9838376
## 17 island 0.4861698
## 18 group_code_UCSC_other 0.6034276
## 19 method_code_IP_other 0.5112258
## 20 state 0.6080918
null_accuracy_treefit <- 0.426
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.2
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:skimr':
##
## kable
## The following object is masked from 'package:dplyr':
##
## group_rows
Variable <- c("size_bin","total","marine_common_year","species_code","island","method_code_IP_other","method_code","group_code_UCSC_other","state","marine_season_code","group_code","season_name","season_sequence","marine_common_season","site_code","mpa_region","georegion","latitude", "longitude","marine_sort_order")
Accuracy <- c(0.43,0.44,0.44,0.46,0.49,0.51,0.51,0.60,0.61,0.62,0.69,0.70,0.70,0.70,0.80,0.90,0.98,1.00,1.00,1.00)
Singlepred_df <- data.frame(Variable,Accuracy)
Singlepred_df
## Variable Accuracy
## 1 size_bin 0.43
## 2 total 0.44
## 3 marine_common_year 0.44
## 4 species_code 0.46
## 5 island 0.49
## 6 method_code_IP_other 0.51
## 7 method_code 0.51
## 8 group_code_UCSC_other 0.60
## 9 state 0.61
## 10 marine_season_code 0.62
## 11 group_code 0.69
## 12 season_name 0.70
## 13 season_sequence 0.70
## 14 marine_common_season 0.70
## 15 site_code 0.80
## 16 mpa_region 0.90
## 17 georegion 0.98
## 18 latitude 1.00
## 19 longitude 1.00
## 20 marine_sort_order 1.00
Singlepred_df_ordered <- Singlepred_df %>%
mutate(Variable = factor(Variable, levels = c("size_bin","total","marine_common_year","species_code","island","method_code_IP_other","method_code","group_code_UCSC_other","state","marine_season_code","group_code","season_name","season_sequence","marine_common_season","site_code","mpa_region","georegion","latitude", "longitude","marine_sort_order"))) %>%
arrange(Variable)
# vertical line conditioning
lvls <- levels(Singlepred_df_ordered$Variable)
vline.level <- 'site_code' # you want to draw line here, right before 18
#geom_vline(aes(xintercept = myLoc))
# plotting
Single_Predictor_Accuracy <- Singlepred_df_ordered %>% ggplot(aes(x = Variable, y=Accuracy, label =Accuracy)) + geom_hline(yintercept=null_accuracy_treefit,color='red', size=2,linetype="dashed") + ylim(0.35,1) + geom_text(stat='identity',color="black", size=5,fill="lightgrey",angle=90) + theme(title = element_text(size=12), axis.title.y = element_text(size=14),axis.title.x = element_blank(),axis.text.y = element_text(size=12), axis.text.x = element_text(size=8,angle=40,hjust=1))
## Warning: Ignoring unknown parameters: fill
Single_Predictor_Accuracy
ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/8.single_predictor_accuracy.png",plot = Single_Predictor_Accuracy, width = 6, height = 4)
bioregion, and for very obvious reasons. I removed variables in 2 ways: one was be more conservative (aka I removed those with above a 90% accuracy- calling it data_subset_A) and one will be less conservative (aka I removed those with above a 70% accuracy- calling it data_subset_B). I ended up deciding subset B would be the most useful moving forward, so I deleted the portion of this code that analyzed subset A to make the script run faster.data_subsetB <- clean_data_final %>% dplyr::select(-marine_sort_order, -latitude, -longitude, -mpa_region, -georegion, -site_code, -marine_common_season)
#write code that splits data into 70/30 train/test
#call the 2 parts data_train and data_test
library(caret)
set.seed(123)
trainset_subsetB <- caret::createDataPartition(y = data_subsetB$bioregion, p = 0.7, list = FALSE)
data_train_subsetB = data_subsetB[trainset_subsetB,] #extract observations/rows for training, assign to new variable
data_test_subsetB = data_subsetB[-trainset_subsetB,] #do the same for the test set
library("mlr")
null_accuracy_train_subsetB <- measureACC("SanFran_2_GovtPt", data_train_subsetB$bioregion)
null_accuracy_train_subsetB
## [1] 0.4258596
#There is probably a nicer tidyverse way of doing this. I just couldn't think of it, so did it this way.
set.seed(1111) #makes each code block reproducible
outcomename = "bioregion"
fitControl <- trainControl(method="repeatedcv",number=5,repeats=5) #setting CV method for caret
Npred <- ncol(data_train_subsetB)-1 # number of predictors
resultmat <- data.frame(Variable = names(data_train_subsetB)[-1], Accuracy = rep(0,Npred)) #store performance for each variable
for (n in 2:ncol(data_train_subsetB)) #loop over each predictor. For this to work, outcome must be in 1st column
{
fit1_subsetB <- caret::train(as.formula(paste(outcomename, "~",names(data_train_subsetB)[n])), data = data_train_subsetB, method = "rpart", trControl = fitControl, na.action = na.pass, tuneLength = 10)
resultmat[n-1,2]= max(fit1_subsetB$results$Accuracy)
}
print(resultmat)
## Variable Accuracy
## 1 group_code 0.6853236
## 2 marine_season_code 0.6255127
## 3 marine_common_year 0.4434338
## 4 season_sequence 0.6970379
## 5 season_name 0.6970384
## 6 method_code 0.5139386
## 7 species_code 0.4583582
## 8 size_bin 0.4283541
## 9 total 0.4440185
## 10 island 0.4861701
## 11 group_code_UCSC_other 0.6034279
## 12 method_code_IP_other 0.5112258
## 13 state 0.6080911
Now let’s fit a full logistic model with all predictors.
set.seed(1111) #makes each code block reproducible
fitControl <- trainControl(method="repeatedcv",number=5,repeats=5)
fit1_subsetB = caret::train(bioregion ~ ., data=data_train_subsetB, method="rpart", trControl = fitControl, na.action = na.pass, tuneLength = 10)
print(fit1_subsetB$results)
## cp Accuracy Kappa AccuracySD KappaSD
## 1 0.001448454 0.9559813 0.9353351 0.005539782 0.008196510
## 2 0.001511430 0.9558728 0.9351623 0.005431082 0.008033431
## 3 0.003211789 0.9541595 0.9326107 0.005192102 0.007676062
## 4 0.004723219 0.9522712 0.9298251 0.005571414 0.008246573
## 5 0.007746080 0.9489743 0.9249124 0.006593184 0.009792540
## 6 0.017286983 0.9352643 0.9041527 0.012654225 0.019132717
## 7 0.044020404 0.9124214 0.8700696 0.011311635 0.016670342
## 8 0.045342906 0.8879650 0.8317558 0.014722759 0.023893200
## 9 0.310409975 0.7822478 0.6516885 0.089881304 0.154096737
## 10 0.472321935 0.5865451 0.2998710 0.133975385 0.249913555
library(rpart.plot)
prp(fit1_subsetB$finalModel, extra = 1, type = 1)
set.seed(1111) #makes each code block reproducible
tuning_grid <- expand.grid( .mtry = seq(1,7,by=1), .splitrule = "gini", .min.node.size = seq(2,8,by=1) )
fit2 = caret::train(bioregion ~ ., data=data_train_subsetB, method="ranger", trControl = fitControl, tuneGrid = tuning_grid, na.action = na.pass)
plot(fit2)
library(gbm)
## Warning: package 'gbm' was built under R version 3.5.2
## Loaded gbm 2.1.5
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2), n.trees = 10, shrinkage = c(0.1, 0.01), n.minobsinnode = c(2,4,6))
fit3 = caret::train(bioregion ~ ., data=data_train_subsetB, method="gbm", trControl = fitControl, verbose=FALSE, tuneGrid = gbmGrid)
plot(fit3)
# copy the random forest code from above. Add a statement to the train() function that centers and scales predictors.
# save the result as fit4.
set.seed(1111) #makes each code block reproducible
tuning_grid <- expand.grid( .mtry = seq(1,7,by=1), .splitrule = "gini", .min.node.size = seq(2,8,by=1) )
fit4 = caret::train(bioregion ~ ., preProcess = c("center", "scale"), data=data_train_subsetB, method="ranger", trControl = fitControl, tuneGrid = tuning_grid, na.action = na.pass)
plot(fit4)
#write code that trains a pda model, use tuneLength 20. Save as fit5 and plot.
set.seed(1111) #makes each code block reproducible
tuning_grid <- expand.grid(lambda = seq(0,1,by=0.05))
fit5 = caret::train(bioregion ~ ., data=data_train_subsetB, method="pda", trControl = fitControl, tuneGrid = tuning_grid, na.action = na.pass, tuneLength = 20)
## Warning: predictions failed for Fold1.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep1: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold1.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep2: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold1.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep3: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold1.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep4: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold1.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold2.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold3.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold4.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning: predictions failed for Fold5.Rep5: lambda=0.00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
#plot model results.
plot(fit5)
resamps <- resamples(list(tree = fit1,
RF1 = fit2,
GBM = fit3,
RF2 = fit4,
PDA = fit5))
bwplot(resamps)
The column labeled “Accuracy” is the overall agreement rate averaged over cross-validation iterations. The agreement standard deviation is also calculated from the cross-validation results. The column “Kappa” is Cohen’s (unweighted) Kappa statistic averaged across the resampling results.
resamps_no_tree <- resamples(list(RF1 = fit2,
GBM = fit3,
RF2 = fit4,
PDA = fit5))
bwplot(resamps_no_tree)
This suggests that while they are all pretty close, it seems like GBM (fit3) may be the best.
library('tidyr')
library('dplyr')
library('forcats')
library('ggplot2')
library('knitr')
library('caret')
#library('doParallel')
library('rpart')
library('rpart.plot')
library('mda')
## Loading required package: class
## Loaded mda 0.4-10
library('ranger')
## Warning: package 'ranger' was built under R version 3.5.2
##
## Attaching package: 'ranger'
## The following object is masked from 'package:rattle':
##
## importance
library('e1071')
## Warning: package 'e1071' was built under R version 3.5.2
##
## Attaching package: 'e1071'
## The following object is masked from 'package:mlr':
##
## impute
#Write code to get model predictions for the outcome on the training data..
predicted <- predict(fit3, data_train_subsetB)
confusion(data_train_subsetB$bioregion, predicted) #generates confusion matrix
## true
## predicted AK_2_BritColumb ChannelIsl_South GovtPt_2_Mexico
## AK_2_BritColumb 257 0 0
## ChannelIsl_South 0 25 0
## GovtPt_2_Mexico 0 0 1656
## OlyCstWA_2_SanFran 0 0 0
## SalishSea_WA 0 0 0
## SanFran_2_GovtPt 0 0 6
## true
## predicted OlyCstWA_2_SanFran SalishSea_WA SanFran_2_GovtPt
## AK_2_BritColumb 0 0 0
## ChannelIsl_South 0 0 0
## GovtPt_2_Mexico 1 0 0
## OlyCstWA_2_SanFran 2562 0 335
## SalishSea_WA 0 457 0
## SanFran_2_GovtPt 26 0 3894
# generating table with all results, with TRUE/FALSE accuracy column to compute ACC
train_data_results <- tibble("Actual" = data_train_subsetB$bioregion, "Predicted" = predicted)
train_accuracy <- measureACC(train_data_results$Predicted, train_data_results$Actual)
#use predicted and actual outcomes, make a table and compute accuracy.
library(knitr)
#install.packages("kableExtra")
library(kableExtra)
labels <- c("Predicted","AK_BrtClm","Salish_WA","OlyCstWA_SF","SF_GovtPt","Chnl_Isl_S","GovtPt_Mexico")
AK_2_BritColumb_col <- c("AK_BrtClm",257,0,0,0,0,0)
SalishSea_WA_col <- c("Salish_WA",0,457,0,0,0,0)
OlyCstWA_2_SanFran_col <- c("OlyCstWA_SF",0,0,2560,25,0,1)
SanFran_2_GovtPt <- c("SF_GovtPt",0,0,337,3895,0,0)
ChannelIsl_South_col <- c("Chnl_Isl_S",0,0,0,0,25,0)
GovtPt_2_Mexico_col <- c("GovtPt_Mexico",0,0,0,6,0,1656)
contingency_table <- data.frame(labels,AK_2_BritColumb_col,SalishSea_WA_col,OlyCstWA_2_SanFran_col,SanFran_2_GovtPt,ChannelIsl_South_col,GovtPt_2_Mexico_col)
colnames(contingency_table) <- c("","Actual","","","","","")
#Multipred_df$RMSE <- as.(Multipred_df$RMSE)
contingency_table
## Actual
## 1 Predicted AK_BrtClm Salish_WA OlyCstWA_SF SF_GovtPt Chnl_Isl_S
## 2 AK_BrtClm 257 0 0 0 0
## 3 Salish_WA 0 457 0 0 0
## 4 OlyCstWA_SF 0 0 2560 337 0
## 5 SF_GovtPt 0 0 25 3895 0
## 6 Chnl_Isl_S 0 0 0 0 25
## 7 GovtPt_Mexico 0 0 1 0 0
##
## 1 GovtPt_Mexico
## 2 0
## 3 0
## 4 0
## 5 6
## 6 0
## 7 1656
kable(contingency_table) %>%
kable_styling(full_width = F)
| Actual | ||||||
|---|---|---|---|---|---|---|
| Predicted | AK_BrtClm | Salish_WA | OlyCstWA_SF | SF_GovtPt | Chnl_Isl_S | GovtPt_Mexico |
| AK_BrtClm | 257 | 0 | 0 | 0 | 0 | 0 |
| Salish_WA | 0 | 457 | 0 | 0 | 0 | 0 |
| OlyCstWA_SF | 0 | 0 | 2560 | 337 | 0 | 0 |
| SF_GovtPt | 0 | 0 | 25 | 3895 | 0 | 6 |
| Chnl_Isl_S | 0 | 0 | 0 | 0 | 25 | 0 |
| GovtPt_Mexico | 0 | 0 | 1 | 0 | 0 | 1656 |
#ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/12.PredvsActual.png",plot = kable(contingency_table), width = 5, height = 4)
null_accuracy_test_subsetB <- measureACC("SanFran_2_GovtPt", data_test_subsetB$bioregion)
null_accuracy_test_subsetB
## [1] 0.4262544
#copy and paste the code from above, but now do it for the test set.
#Write code to get model predictions for the outcome on the training data..
predicted_final <- predict(fit3, data_test_subsetB)
confusion(data_test_subsetB$bioregion, predicted_final) #generates confusion matrix
## true
## predicted AK_2_BritColumb ChannelIsl_South GovtPt_2_Mexico
## AK_2_BritColumb 109 0 0
## ChannelIsl_South 0 10 0
## GovtPt_2_Mexico 0 0 706
## OlyCstWA_2_SanFran 0 0 0
## SalishSea_WA 0 0 0
## SanFran_2_GovtPt 0 0 0
## true
## predicted OlyCstWA_2_SanFran SalishSea_WA SanFran_2_GovtPt
## AK_2_BritColumb 0 0 0
## ChannelIsl_South 0 0 0
## GovtPt_2_Mexico 3 0 0
## OlyCstWA_2_SanFran 1097 0 144
## SalishSea_WA 0 195 0
## SanFran_2_GovtPt 16 0 1666
# generating table with all results, with TRUE/FALSE accuracy column to compute ACC
test_data_results <- tibble("Actual" = data_test_subsetB$bioregion, "Predicted" = predicted_final)
test_accuracy <- measureACC(test_data_results$Predicted, test_data_results$Actual)
#use predicted and actual outcomes, make a table and compute accuracy.
library(knitr)
#install.packages("kableExtra")
library(kableExtra)
labels <- c("Predicted","AK_BrtClm","Salish_WA","OlyCstWA_SF","SF_GovtPt","Chnl_Isl_S","GovtPt_Mexico")
AK_2_BritColumb_col <- c("AK_BrtClm",109,0,0,0,0,0)
SalishSea_WA_col <- c("Salish_WA",0,195,0,0,0,0)
OlyCstWA_2_SanFran_col <- c("OlyCstWA_SF",0,0,1096,14,0,3)
SanFran_2_GovtPt <- c("SF_GovtPt",0,0,145,1668,0,0)
ChannelIsl_South_col <- c("Chnl_Isl_S",0,0,0,0,10,0)
GovtPt_2_Mexico_col <- c("GovtPt_Mexico",0,0,0,0,0,706)
contingency_table2 <- data.frame(labels,AK_2_BritColumb_col,SalishSea_WA_col,OlyCstWA_2_SanFran_col,SanFran_2_GovtPt,ChannelIsl_South_col,GovtPt_2_Mexico_col)
colnames(contingency_table) <- c("","Actual","","","","","")
contingency_table2
## labels AK_2_BritColumb_col SalishSea_WA_col
## 1 Predicted AK_BrtClm Salish_WA
## 2 AK_BrtClm 109 0
## 3 Salish_WA 0 195
## 4 OlyCstWA_SF 0 0
## 5 SF_GovtPt 0 0
## 6 Chnl_Isl_S 0 0
## 7 GovtPt_Mexico 0 0
## OlyCstWA_2_SanFran_col SanFran_2_GovtPt ChannelIsl_South_col
## 1 OlyCstWA_SF SF_GovtPt Chnl_Isl_S
## 2 0 0 0
## 3 0 0 0
## 4 1096 145 0
## 5 14 1668 0
## 6 0 0 10
## 7 3 0 0
## GovtPt_2_Mexico_col
## 1 GovtPt_Mexico
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 706
kable(contingency_table) %>%
kable_styling(full_width = F)
| Actual | ||||||
|---|---|---|---|---|---|---|
| Predicted | AK_BrtClm | Salish_WA | OlyCstWA_SF | SF_GovtPt | Chnl_Isl_S | GovtPt_Mexico |
| AK_BrtClm | 257 | 0 | 0 | 0 | 0 | 0 |
| Salish_WA | 0 | 457 | 0 | 0 | 0 | 0 |
| OlyCstWA_SF | 0 | 0 | 2560 | 337 | 0 | 0 |
| SF_GovtPt | 0 | 0 | 25 | 3895 | 0 | 6 |
| Chnl_Isl_S | 0 | 0 | 0 | 0 | 25 | 0 |
| GovtPt_Mexico | 0 | 0 | 1 | 0 | 0 | 1656 |
null_accuracy_test_subsetB<- round(null_accuracy_test_subsetB,4)
null_accuracy_train_subsetB<- round(null_accuracy_train_subsetB,4)
train_accuracy<- round(train_accuracy,4)
test_accuracy <- round(test_accuracy,4)
Data_cat <- c("Null_train","GBMmodel_train","Null_test","GBMmodel_test")
accuracy_cat <- c(null_accuracy_train_subsetB,train_accuracy,null_accuracy_test_subsetB,test_accuracy)
Final_Results_cat <- data.frame(Data_cat,accuracy_cat)
Final_Results_cat
## Data_cat accuracy_cat
## 1 Null_train 0.4259
## 2 GBMmodel_train 0.9601
## 3 Null_test 0.4263
## 4 GBMmodel_test 0.9587
Final_Results_cat_ordered <- Final_Results_cat %>%
mutate(Data_cat = factor(Data_cat, levels = c("Null_train","GBMmodel_train","Null_test","GBMmodel_test"))) %>%
arrange(Data_cat)
Final_Results_cat_ordered_plot <- Final_Results_cat_ordered %>% ggplot(aes(x = Data_cat, y=accuracy_cat, label=accuracy_cat)) +geom_label(stat='identity',fill=RMSE_colors, size=9, face="bold") + ylab("Accuracy (%)") + ylim(0.3,1.0) +
theme(title = element_text(size=12), axis.title.y = element_text(size=16),axis.title.x = element_blank(),axis.text.y = element_text(size=12), axis.text.x = element_text(size=12))
## Warning: Ignoring unknown parameters: face
Final_Results_cat_ordered_plot
ggsave(filename = "../../results/Categorical_Outcome_Modeling_results/14.final_results_cat.png",plot = Final_Results_cat_ordered_plot, width = 5, height = 4)
Accuracy went down, but only by ~0.1%, and its still very high.